/******************************************************************************/
/* Creating analysis data using 3-digit NAICS codes rather than 2-digit codes */
/******************************************************************************/


/***********************************************************/
/* Obtain national employment growth by industry-firm size */
/* Saves NationalIndustryFirmGrowth3digit.dta              */
/***********************************************************/
local Data "NationalIndustryFirmGrowth3digit"

noi disp "Obtaining national employment growth by firm-size size data..."
noi disp "Verifying that `Data' does not already exist..."
* confirm file exits with return code 601 when the file is absent; capture
* stores that code in _rc instead of stopping the do-file.
* NOTE(review): the check is relative to the current working directory, while
* the dataset is saved under $LocalData -- confirm both are the same folder.
capture confirm file "`Data'.dta"

* Build the dataset only when the file was not found.
if _rc == 601 {

	noi disp "Data not found.  Building data..."
	noi disp "Loading National 3 digit employment.csv..."
	* Raw employment by geography, industry, year and firm size.
	insheet using "$SourceData\National 3 digit employment.csv", clear
	noi disp "Finished loading."

	* Drop finance (codes 52x) and NAICS 92x; per the original author the
	* latter has no large-v.-small data. Open question carried over from the
	* original: this filter used to sit lower in the file -- which is correct?
	drop if inlist(floor(industry/10), 52, 92)

	* Stop if any geography-industry-year-firmsize cell is duplicated.
	isid geography industry year firmsize

	* Retain only geographies whose first observed year is 2000 and whose
	* last observed year is 2017.
	bysort geography: egen firstyear = min(year)
	bysort geography: egen lastyear = max(year)
	keep if firstyear==2000 & lastyear==2017
	drop firstyear lastyear

	* Sum employment over geographies, then total it over firm sizes.
	collapse (sum) emp, by(industry year firmsize)
	bysort industry year: egen totemp = total(emp)
	label var emp "Total employment for the industry, year, and firm size"

	* Fold firm sizes 1, 2 and 3 into the size-1 row.
	gen small_part = emp if inlist(firmsize, 1, 2, 3)
	bysort industry year: egen small_total = total(small_part)
	drop small_part
	replace emp = small_total if firmsize==1
	drop small_total
	* Only two categories remain: combined sizes 1-3 (coded 1) and size 5.
	keep if inlist(firmsize, 1, 5)

	* totemp was computed before the categories were combined, so it is the
	* total over every firm size present after the collapse.
	gen sizeshare = emp/totemp
	label var sizeshare "Share for a size, industry, and year"

	* Year-over-year log change in employment within each industry-size series.
	bysort industry firmsize (year): gen g_nat_ind_size = ln(emp) - ln(emp[_n-1])
	label var g_nat_ind_size "Growth in national industry employment, by size"

	* Year-over-year change in the employment share.
	by industry firmsize: gen d_nat_ind_share = sizeshare - sizeshare[_n-1]
	label var d_nat_ind_share "Change in national industry employment share, by size"

	* Changes relative to the first observation of each series. These feed
	* only the disabled plot below and are removed by the keep that follows.
	by industry firmsize: gen sizeshare1_2000 = sizeshare[1]
	gen dn_i_sizeshare1_t_00 = sizeshare - sizeshare1_2000

	by industry: gen emp_2000 = totemp[1]
	gen n_i_gr_t_00 = (totemp - emp_2000)/emp_2000

	* Plots of size share by industry (disabled).
	/*
		twoway (line dn_i_sizeshare1_t_00 year if firmsize==1, yaxis(1))(line n_i_gr_t_00 year if firmsize==1, yaxis(2)), ytitle("Change in Size Share (Left), Emp Growth (Right)") legend(lab(1 "Chg Small Firm Share") lab(2 "Pct Chng Emp")) by(industry)
		graph export "$LocalData\Results\US_industry_SmallShare.png", replace
	*/

	* Keep what is needed and go wide: one row per industry-year, with the
	* firm-size code (1 or 5) appended to each variable name.
	keep industry firmsize year g_nat_ind_size d_nat_ind_share sizeshare totemp
	sort industry year
	reshape wide g_nat_ind_size d_nat_ind_share sizeshare, i(industry year) j(firmsize)

	noi disp "Saving `Data'.dta"
	compress
	* The doubled backslash keeps Stata from treating the backslash as an
	* escape for the macro quote that follows it.
	save "$LocalData\\`Data'.dta", replace
	* NOTE(review): CurrentDate is not defined in this section; it is presumably
	* set by the calling do-file -- confirm.
	save "$LocalData\Archive\\`Data'`CurrentDate'.dta", replace
}
* Reached whenever _rc is not 601.
else noi disp "Data already exists."
noi etime
noi disp " "
	
	
		



	
/***********************************************************/
/* Obtain county-industry employment shares and Bartik     */
/* instruments. Saves CountyIndustryEmp_3digit.dta         */
/***********************************************************/
local Data "CountyIndustryEmp_3digit"

noi disp "Obtaining county industrial data..."
noi disp "Verifying that `Data'.dta does not already exist..."
* confirm file exits with return code 601 when the file is absent; capture
* stores that code in _rc instead of stopping the do-file.
* NOTE(review): the check is relative to the current working directory, while
* the dataset is later saved under $LocalData -- confirm both are the same folder.
capture confirm file `Data'.dta

* Build the dataset only when the file was not found. The brace opened here is
* closed after the county industrial dataset has been saved.
if _rc == 601 {
noi disp "Data not found.  Building data..."
noi disp "Loading 3_digit_q2.csv..."
insheet using "$SourceData\3_digit_q2.csv", clear
noi disp "Finished loading data"
noi disp "$LocalData"

* Keep only the county-industries used in the analysis: drop codes 52x and 92x.
* NOTE(review): the original comment also mentions dropping aggregate and state
* rows; no such filter is visible here -- confirm the source file excludes them.
keep if floor(industry/10)!=52 & floor(industry/10)!=92 
	/*If wanting to restrict to only those counties with material (i.e. non-censored) employment shares for every industry
	gen temp = cond(missing(emp),1,0)
	bysort geography year: egen empty = max(temp)
	keep if empty==0
	drop empty temp
	*/


noi disp "Keeping only county-industry observations from 2000."

* Initial shares are measured in 2000, so only that year is kept.
keep if year==2000

*Note that in removing industry 52 (and 92) we cannot rely on county aggregates for any employment numbers across industries
*Summing up county-industry employment
*Note: we are setting to "0" the empties by necessity of construction of Bartik. For smaller counties, this is likely a bigger issue for computing industry shares.
*If we restrict to only counties with reported values for each industry-county, we are left with ~300 counties or so. (I think results are robust to this). However, we could not extend this approach to 3-digit NAICS codes.
* Generate county-industry employment (c_i_emp), with missing cells set to zero.
noi gen c_i_emp = emp
noi replace c_i_emp = 0 if missing(c_i_emp)
label var c_i_emp "County-Industry Employment"

* Generate total county employment (c_emp) as the sum over the kept industries.
bysort geography: egen c_emp = sum(c_i_emp)
* Fix: this label belongs to c_emp. The original applied it to c_i_emp, which
* overwrote the label set above and left c_emp unlabeled.
label var c_emp "Total County Employment"
* Drop if there is no employment in the county
noi drop if c_emp <= 0

/*
** IS THIS NEEDED?
* Generate total national employment (n_emp)
egen n_emp = sum(c_i_emp)
label var n_emp "Total National Employment"
*/
	
* County-industry employment shares in 2000.
gen c_i_share_2000 = c_i_emp/c_emp

* Check that every county's shares sum to 1.
bysort geography: egen total = sum(c_i_share_2000)
* Due to rounding, some are negligibly less than 1 (0.9999999)
assert total > 0.99
drop total /* Not needed once check is complete */

* year is dropped here; the year used below is expected to come from
* CountyYearFirmSizes.dta through the joinby.
keep geography industry c_i_share_2000 c_emp
		
* Merge in county size shares.
sort geography
* joinby combines the datasets by forming all pairwise combinations within
* county (geography), so each county-industry row is repeated for every row
* the using file holds for that county.
joinby geography using "$LocalData\CountyYearFirmSizes.dta"

* Merge in national industry-firmsize growth rates.
sort industry year
* NOTE(review): industry-year rows that exist only in the using file are kept
* with missing geography because _merge is dropped without filtering -- confirm
* that this is intended.
merge m:1 industry year using "$LocalData\NationalIndustryFirmGrowth3digit.dta"
drop _merge
		
	
****************************************************************************************
*Construction of Bartik Instrument******************************************************
****************************************************************************************
	
* Bartik instrument: national industry growth for small (suffix 1) and large
* (suffix 5) firms, weighted by the county's 2000 industry shares.
* asgen is not a built-in Stata command and must be installed separately.
noi disp "Using asgen to generate industry growth, by size, weighted by county-industry shares."
	bysort geography year: asgen bartikGL1 = g_nat_ind_size5, weight(c_i_share_2000)
	bysort geography year: asgen bartikGS1 = g_nat_ind_size1, weight(c_i_share_2000)
	bysort geography year: asgen bartikL1 = d_nat_ind_share5, weight(c_i_share_2000)
	bysort geography year: asgen bartikS1 = d_nat_ind_share1, weight(c_i_share_2000)
		

noi disp "Obtaining industry match data..."
noi disp "Verifying that this data does not already exist..."
* capture leaves return code 601 in _rc when IndustryMatch3.dta is not found.
* NOTE(review): the check is relative to the current working directory, while
* the file is saved under $LocalData -- confirm both are the same folder.
capture confirm file IndustryMatch3.dta

* Build the lookup only when it is missing.
if _rc == 601 {
	noi disp "Data not found.  Building data..."
	noi disp "Building IndustryMatch.dta..."
	* Set the data in memory aside while the lookup file is built.
	preserve

	insheet using "$SourceData\NAIC 3 digit.csv", clear

	* The first two columns become the industry code and its label.
	rename v1 industry
	rename v2 IndLabel

	* Retain a single row per industry code.
	bysort industry: keep if _n == 1

	compress
	save "$LocalData\IndustryMatch3.dta", replace
	save "$LocalData\Archive\IndustryMatch3`CurrentDate'.dta", replace
	* Bring back the data that was set aside above.
	restore

}
* Reached whenever _rc is not 601.
else noi disp "Data already exists."
noi etime
noi disp " "


noi disp "Obtaining initial shares data..."
noi disp "Verifying that this data does not already exist..."
* capture leaves return code 601 in _rc when the file is not found.
* NOTE(review): the check is relative to the current working directory, while
* the file is saved under $LocalData -- confirm both are the same folder.
capture confirm file InitialShares2000_3digit.dta

* Build the wide initial-shares file only when it is missing.
if _rc == 601 {
	noi disp "Data not found.  Building data..."
	noi disp "Building InitialShares2000_3digit.dta..."

	* Go wide so that initial shares and national industry firm-size growth
	* rates become columns of a county-year dataset. The long data in memory
	* is set aside first and brought back at the end.
	preserve
	keep year geography c_i_share_2000 industry g_nat_ind_size1 g_nat_ind_size5 d_nat_ind_share1 d_nat_ind_share5
	drop if industry==. | floor(industry/10)==52
	replace c_i_share_2000 = 0 if missing(c_i_share_2000)

	* Append an underscore so the industry code is set off from the stub name
	* in the reshaped variable names.
	foreach v in g_nat_ind_size1 g_nat_ind_size5 d_nat_ind_share1 d_nat_ind_share5 {
		rename `v' `v'_
	}

	* reshape needs a numeric j variable.
	destring industry, replace
	keep if !missing(industry)
	reshape wide c_i_share_2000 g_nat_ind_size1_ g_nat_ind_size5_ d_nat_ind_share1_ d_nat_ind_share5_, i(geography year) j(industry)

	* County shares: fill gaps with the county's maximum for that industry,
	* then set anything still missing to zero.
	foreach cshare of varlist c_i_share_2000* {
		tempvar fill
		bysort geography: egen `fill' = max(`cshare')
		replace `cshare' = `fill' if missing(`cshare')
		drop `fill'
		replace `cshare' = 0 if missing(`cshare')
	}

	* National series: fill gaps with the maximum for that year, then set
	* anything still missing to zero. The stubs are processed in the same
	* order as before.
	foreach stub in g_nat_ind_size1_ g_nat_ind_size5_ d_nat_ind_share1_ d_nat_ind_share5_ {
		foreach g of varlist `stub'* {
			tempvar fill
			bysort year: egen `fill' = max(`g')
			replace `g' = `fill' if missing(`g')
			drop `fill'
			replace `g' = 0 if missing(`g')
		}
	}

	compress
	save "$LocalData\InitialShares2000_3digit.dta", replace
	save "$LocalData\Archive\InitialShares2000_3digit_`CurrentDate'.dta", replace

	restore


}
* Reached whenever _rc is not 601.
else noi disp "Data already exists."
noi etime
noi disp " "

		
/******************************************************************************/
/* Generating County Industrial Dataset                                       */
/******************************************************************************/


	* Reduce to one row per county-year and keep only county-level variables.
	bysort geography year: keep if _n == 1
	keep geography year c_emp bartikGS* bartikGL* bartikS* bartikL* gSmallFirm* gLargeFirm* dSmallShare* dLargeShare* emp_size1 emp_size5

	sort geography year
	compress

	* Attach the wide initial-share and national growth columns.
	merge 1:1 geography year using "$LocalData\InitialShares2000_3digit.dta", nogenerate

	compress
	* The doubled backslash keeps Stata from treating the backslash as an
	* escape for the macro quote that follows it.
	save "$LocalData\\`Data'.dta", replace
	save "$LocalData\Archive\\`Data'`CurrentDate'.dta", replace

* This brace closes the if that follows the existence check for `Data'.dta.
}
* Reached when that check did not return code 601.
else noi disp "Data already exists."
noi etime
noi disp " "



	
*******************************************************************************
****Generating "Bartik Analytic Data"   ***************************************
*******************************************************************************

noi disp "Obtaining Bartik analytic data..."
noi disp "Verifying that BartikAnalyticData_3digit.dta does not already exist..."
* confirm file exits with return code 601 when the file is absent; capture
* stores that code in _rc instead of stopping the do-file.
* NOTE(review): the check is relative to the current working directory, while
* the dataset is saved under $LocalData -- confirm both are the same folder.
capture confirm file BartikAnalyticData_3digit.dta

* Build the analytic dataset only when the file was not found.
if _rc == 601 {
	noi disp "Data not found.  Building data..."
	noi disp "Building BartikAnalyticData_3digit.dta..."


	noi disp "Loading CountyBanking.dta..."
	use "$LocalData\CountyBanking.dta", clear	
	noi disp "Merging CountyIndustryEmp_3digit.dta..."
	sort geography year
	* NOTE(review): this and the next merge use the old merge syntax, which does
	* not check that geography-year is unique on either side. Consider
	* merge 1:1 once uniqueness has been confirmed.
	merge geography year using "$LocalData\CountyIndustryEmp_3digit.dta"
	drop _merge
	sort geography year

	noi disp "Merging Demographic.dta..."
	merge geography year using "$LocalData\Demographic.dta"

	sort geography year
	* Diagnostic only: summarize the number of rows per geography-year so that
	* duplicates (nobs > 1) show up in the log.
	by geography year: gen nobs = _N
	su nobs
	drop nobs
	
	*Initial demographic data 
	keep if year>=2000
	*Restrict to observations with first observations in 2000. Lose 58 of 57,827 county-years.
	bysort geography: egen minyear = min(year)
	keep if minyear==2000
	drop minyear
	* Fix: sort explicitly by year within county, so the subscripts below pick
	* observations by year order. bysort on geography alone does not guarantee
	* the order of years within a county.
	bysort geography (year): gen br2000 = cnty_br[1]
	bysort geography (year): gen emp2000 = c_emp[1]
	bysort geography (year): gen income2000 = income[1]
*	by geography: gen small_dep_share2000 = small_dep_share[1]
*	by geography: gen large_dep_share2000 = large_dep_share[1]
*	by geography: gen hhi_dep2000 = hhi_dep[1]
	
	* Small-firm share of county employment across the emp_size* variables.
	egen empsize_total = rowtotal(emp_size*)
	gen c_sizeshare1 = emp_size1/empsize_total
	* NOTE(review): this takes the THIRD observation per county, not the first,
	* even though the variable is named ...2000. Left unchanged -- confirm
	* whether [3] is deliberate or should be [1].
	bysort geography (year): gen sm_size_share2000 = c_sizeshare1[3]
	
*	gen lnbr2000 = ln(br2000)
	
	* Keep only counties with both employment and branch data in their first year.
	drop if emp2000==. | br2000==.

	* Express level variables in thousands, where appropriate.
	* pop2000 is not created in this section; presumably it comes from
	* Demographic.dta -- confirm.
	replace pop2000 = pop2000/1000
	replace income2000 = income2000/1000
	gen emp_2000 = emp2000/1000

	/*  NEEDED? */
	* Some state IDs got lost in the merges; rebuild the state code from the
	* leading digits of the geography code.
	gen state=floor(geography/1000)
	
	* _merge left over from the Demographic.dta merge.
	drop _merge
		
	* Save the analytical dataset.
	compress
	save "$LocalData\BartikAnalyticData_3digit.dta", replace
	save "$LocalData\Archive\BartikAnalyticData_3digit`CurrentDate'.dta", replace

}
* Reached whenever _rc is not 601.
else noi disp "Data already exists."
noi etime
noi disp " "


